{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Training Gradient Boosting Models with CatBoost"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/catboost/tutorials/blob/master/events/2020_06_04_catboost_tutorial/catboost_features.ipynb)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Libraries installation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# # For Google Colaboratory:\n",
    "# !pip install catboost sklearn shap\n",
    "\n",
    "# # For your machine:\n",
    "# !pip install --user -U ipywidgets catboost sklearn shap\n",
    "# !jupyter nbextension enable --py widgetsnbextension"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "np.set_printoptions(precision=4)\n",
    "\n",
    "import catboost\n",
    "print(catboost.__version__)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Reading the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from catboost.datasets import msrank_10k\n",
    "\n",
    "# If you have \"URLError: SSL: CERTIFICATE_VERIFY_FAILED\" uncomment next two lines:\n",
    "# import ssl\n",
    "# ssl._create_default_https_context = ssl._create_unverified_context\n",
    "\n",
    "(train_df, test_df) = msrank_10k()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Preparing the data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Label values extraction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "y = train_df[0]\n",
    "X = train_df.drop([0, 1], axis=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Ways to create Pool class. If you have a big dataset it is effective (in terms of time) to load data from file, instead of pandas Dataframe."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset_dir = './msrank_10k'\n",
    "if not os.path.exists(dataset_dir):\n",
    "    os.makedirs(dataset_dir)\n",
    "\n",
    "train_df.to_csv(\n",
    "    os.path.join(dataset_dir, 'train.csv'),\n",
    "    index=False, sep=',', header=True\n",
    ")\n",
    "test_df.to_csv(\n",
    "    os.path.join(dataset_dir, 'test.csv'),\n",
    "    index=False, sep=',', header=True\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!head -2 msrank_10k/train.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from catboost.utils import create_cd\n",
    "feature_names = dict(map(lambda i: (i + 2, 'Feature ' + str(i)), range(train_df.shape[1])))\n",
    "    \n",
    "create_cd(\n",
    "    label=0,\n",
    "    feature_names=feature_names,\n",
    "    auxiliary_columns=[1],\n",
    "    output_path=os.path.join(dataset_dir, 'train.cd')\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!head msrank_10k/train.cd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from catboost import Pool\n",
    "\n",
    "pool1 = Pool(data=X, label=y)\n",
    "\n",
    "pool2 = Pool(\n",
    "    data=os.path.join(dataset_dir, 'train.csv'), \n",
    "    delimiter=',', \n",
    "    column_description=os.path.join(dataset_dir, 'train.cd'),\n",
    "    has_header=True,\n",
    ")\n",
    "\n",
    "print('Dataset shape: {}\\n'.format(pool1.shape))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Split your data into train and validation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "data = train_test_split(X, y, train_size=0.8, random_state=0)\n",
    "X_train, X_validation, y_train, y_validation = data\n",
    "\n",
    "train_pool = Pool(data=X_train, label=y_train)\n",
    "validation_pool = Pool(data=X_validation, label=y_validation)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Dataset Quantization"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Features quantization. It is effective to quantize features single time before several trainings."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_pool.quantize(\n",
    "    border_count=254,\n",
    "    # per_float_feature_quantization=['0:border_count=1024']\n",
    ")\n",
    "\n",
    "train_pool.save_quantization_borders('borders.tsv')\n",
    "\n",
    "validation_pool.quantize(input_borders='borders.tsv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from catboost.utils import quantize\n",
    "\n",
    "pool2 = quantize(\n",
    "    data_path=os.path.join(dataset_dir, 'train.csv'),\n",
    "    delimiter=',',\n",
    "    column_description=os.path.join(dataset_dir, 'train.cd'),\n",
    "    has_header=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Training"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from catboost import CatBoostRegressor\n",
    "\n",
    "model = CatBoostRegressor(\n",
    "    iterations=5,\n",
    "    learning_rate=0.1,\n",
    ")\n",
    "model.fit(train_pool, eval_set=validation_pool, verbose=False)\n",
    "\n",
    "print('Model is fitted: {}'.format(model.is_fitted()))\n",
    "print('Model params:\\n{}'.format(model.get_params()))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Stdout of the training"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "model = CatBoostRegressor(\n",
    "    iterations=15,\n",
    "#     verbose=5,\n",
    ")\n",
    "model.fit(train_pool, eval_set=validation_pool);"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Metrics calculation and graph plotting"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = CatBoostRegressor(\n",
    "    iterations=200,\n",
    "    learning_rate=0.2,\n",
    "    custom_metric=['MAE', 'R2']\n",
    ")\n",
    "\n",
    "model.fit(\n",
    "    train_pool,\n",
    "    eval_set=validation_pool,\n",
    "    verbose=False,\n",
    "    plot=True\n",
    ");"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Best iteration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = CatBoostRegressor(\n",
    "    iterations=100,\n",
    "    eval_metric='MAE',\n",
    "    learning_rate=0.5,\n",
    "#     use_best_model=False\n",
    ")\n",
    "model.fit(\n",
    "    train_pool,\n",
    "    eval_set=validation_pool,\n",
    "    verbose=False,\n",
    "    plot=True\n",
    ");"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print('Tree count: ' + str(model.tree_count_))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Grid Search"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pool = Pool(data=X_train, label=y_train)\n",
    "model = CatBoostRegressor(iterations=10, eval_metric='MAE')\n",
    "grid = {'learning_rate': [0.001, 0.01, 0.1], 'depth': [4, 5, 6]}\n",
    "result = model.grid_search(grid, pool)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print('Best parameters: {}\\n'.format(result['params']))\n",
    "\n",
    "msg = 'Mean MAE value on validation set per each iteration:\\n{}'\n",
    "print(msg.format(np.round(result['cv_results']['test-MAE-mean'], 4)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.get_params()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.predict(validation_pool)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "model = CatBoostRegressor(iterations=100, eval_metric='MAE')\n",
    "model.grid_search(grid, pool, plot=True, verbose=False);"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "More about parameter tuning you can find in [tutorial](https://github.com/catboost/catboost/blob/master/catboost/tutorials/hyperparameters_tuning/hyperparameters_tuning.ipynb)."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Feature importances"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Prediction values change"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.get_feature_importance(prettified=True).head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Loss function change"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.get_feature_importance(\n",
    "    data=validation_pool, \n",
    "    type='LossFunctionChange',\n",
    "    prettified=True\n",
    ").head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Shap values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = CatBoostRegressor(iterations=1000, learning_rate=0.1)\n",
    "model.fit(\n",
    "    train_pool,\n",
    "    eval_set=validation_pool,\n",
    "    verbose=False,\n",
    "    plot=True\n",
    ");\n",
    "\n",
    "\n",
    "shap_values = model.get_feature_importance(\n",
    "    data=train_pool, \n",
    "    type='ShapValues'\n",
    ")\n",
    "\n",
    "expected_value = shap_values[0,-1]\n",
    "shap_values = shap_values[:,:-1]\n",
    "\n",
    "print(shap_values.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import shap\n",
    "\n",
    "shap.initjs()\n",
    "shap.force_plot(\n",
    "    expected_value,\n",
    "    shap_values[1,:],\n",
    "    feature_names=train_pool.get_feature_names()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "shap.force_plot(\n",
    "    expected_value,\n",
    "    shap_values[7,:],\n",
    "    feature_names=train_pool.get_feature_names()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "shap.summary_plot(shap_values, X_train)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "More information about shap value usage you can find in [tutorial](https://github.com/catboost/catboost/blob/master/catboost/tutorials/model_analysis/shap_values_tutorial.ipynb)."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Tree Visualization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = CatBoostRegressor(iterations=2, depth=1, boost_from_average=False)\n",
    "\n",
    "features = [\n",
    "    [1, 2, 3], \n",
    "    [4, 5, 6],\n",
    "    [7, 8, 9]\n",
    "]\n",
    "labels = [1, 0, 1]\n",
    "\n",
    "model.fit(features, labels, verbose=False);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# This cell doesn't work without graphviz package\n",
    "# You can install it by link https://graphviz.gitlab.io/download/\n",
    "# Installation can take a lot of time. You can do it at home.\n",
    "\n",
    "from IPython.display import display\n",
    "\n",
    "display(model.plot_tree(0))\n",
    "display(model.plot_tree(1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "x = [0, 7, 2]\n",
    "\n",
    "raw_pred = model.predict([x])\n",
    "print(raw_pred)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Grow Policy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = CatBoostRegressor(iterations=2, depth=4, grow_policy='Depthwise')\n",
    "model.fit(features, labels, verbose=False);\n",
    "display(model.plot_tree(0))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "model = CatBoostRegressor(iterations=2, max_leaves=16, grow_policy='Lossguide')\n",
    "model.fit(features, labels, verbose=False);\n",
    "display(model.plot_tree(0))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Snapshotting"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# !rm 'catboost_info/snapshot.bkp'\n",
    "\n",
    "model = CatBoostRegressor(\n",
    "    iterations=200,\n",
    "    save_snapshot=True,\n",
    "    snapshot_file='snapshot.bkp',\n",
    "    snapshot_interval=1\n",
    ")\n",
    "\n",
    "model.fit(train_pool, eval_set=validation_pool, verbose=10);"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Saving the model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = CatBoostRegressor(iterations=10)\n",
    "model.fit(train_pool, eval_set=validation_pool, verbose=False)\n",
    "model.save_model('catboost_model.bin')\n",
    "model.save_model('catboost_model.json', format='json')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.load_model('catboost_model.bin')\n",
    "print(model.get_params())\n",
    "print(model.learning_rate_)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  },
  "widgets": {
   "state": {
    "1057714ebc614324aa3ba2cf69408966": {
     "views": [
      {
       "cell_index": 17
      }
     ]
    },
    "8381e9eed05f4a03905ae8a56d7ab4ea": {
     "views": [
      {
       "cell_index": 48
      }
     ]
    },
    "f49684e8c5c44241bfe2c7f577f5cb41": {
     "views": [
      {
       "cell_index": 53
      }
     ]
    }
   },
   "version": "1.2.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
